In this workshop, you will be introduced to the fundamentals of acquiring and utilizing cancer genomic data. Upon completion of the course, you will be able to produce your own comprehensive R Markdown report for your assigned TCGA ID, featuring plots and identifying impactful mutations and genes from RNAseq.
“Comprehensive Molecular Portraits of Invasive Lobular Breast Cancer”: Ciriello et al, 2015, Cell.
In this workshop, we will primarily use the cBioPortal API to obtain the necessary data. It is therefore important that you have an internet connection and that R, RStudio, as well as the packages listed above, are installed correctly.
Each student should have been assigned a TCGA ID prior to this. Please make sure you have one and that it is one of the IDs listed below.
# Again - please make sure to set to set the working directory
# if you don't do this your script will fail to run.
# Session->set working directory -> to source file location
# load require package and "source" the auxiliary R script design for this workshop.
# library ( cgdsr), cgdsr now obsolete --
library(cBioPortalData)
library ( ggplot2)
library(knitr)
library(kableExtra)
library ( dplyr)
library ( reshape2)
library(VennDiagram)
library(gridExtra)
library (ggrepel)
library ( DT )
library ( ggpubr )
library(clusterProfiler)
library (DOSE)
library( 'org.Hs.eg.db', character.only = TRUE)
# the auxiliary script will help manage some of the more complex/redundant tasks
source("auxi.R")
# initiate cbioportal API
cbiop <- cBioPortal(
hostname = "www.cbioportal.org",
protocol = "https",
api. = "/api/v2/api-docs",
token = character()
)
# Start by getting a list of cancer studies available at cbio
studies <- getStudies(cbiop, buildReport = TRUE)
# This will return a table with a list of available cancer studies to draw from
k2( head ( studies, 10) ) # "example list of available studies on cbioportal"
# note: k2 is one of those cheat codes to make tables print out nicely
# Want to know more about a function? use "?" and the function name
# ? t.test
# ? getStudies
# lets find out how many studies are available to the public
dim ( studies ) # dim is a function to tabulate a dataframe
## [1] 400 15
# Question. How can you find the study you need?
## Answer we can use the grepl function which uses regular expression to search within objects and returns a
## a vector of T and F
head ( grepl("Breast", studies$name, ignore.case = T ), 3)
## [1] TRUE FALSE FALSE
# an index is a position in a vector, eg apple, pear, peach
# so below will give you the positions of all your matches
which ( grepl("Breast", studies$name, ignore.case = T ))
## [1] 1 15 26 30 31 32 33 34 35 36 37 38 39 44 46 47 49 152 290
## [20] 296 342 343 350 351 358
# we can also directly output the studies instead by injecting the index directly
head( studies$name [grepl("Breast", studies$name, ignore.case = T ) ], 3)
## [1] "Adenoid Cystic Carcinoma of the Breast (MSK, J Pathol. 2015)"
## [2] "Breast Fibroepithelial Tumors (Duke-NUS, Nat Genet 2015)"
## [3] "Breast Invasive Carcinoma (Broad, Nature 2012)"
# Question, what is the T for? what if you use lower case instead of upper case
head ( studies$name [grepl("breast", studies$name, ignore.case = F ) ], 3)
## [1] "MAPK on resistance to anti-HER2 therapy for breast cancer (MSK, Nat Commun. 2022)"
## [2] "Proteogenomic landscape of breast cancer (CPTAC, Cell 2020)"
## now try it with T
head ( studies$name [grepl("breast", studies$name, ignore.case = T ) ], 3)
## [1] "Adenoid Cystic Carcinoma of the Breast (MSK, J Pathol. 2015)"
## [2] "Breast Fibroepithelial Tumors (Duke-NUS, Nat Genet 2015)"
## [3] "Breast Invasive Carcinoma (Broad, Nature 2012)"
# According to the syllabus we are looking for TCGA study on Breast Invasive Carcinoma (TCGA, Cell 2015)
studies$name [grepl("Breast Invasive Carcinoma", studies$name, ignore.case = T ) ]
## [1] "Breast Invasive Carcinoma (Broad, Nature 2012)"
## [2] "Breast Invasive Carcinoma (Sanger, Nature 2012)"
## [3] "Breast Invasive Carcinoma (TCGA, Nature 2012)"
## [4] "Breast Invasive Carcinoma (British Columbia, Nature 2012)"
## [5] "Breast Invasive Carcinoma (TCGA, PanCancer Atlas)"
## [6] "Breast Invasive Carcinoma (TCGA, Firehose Legacy)"
## [7] "Breast Invasive Carcinoma (TCGA, Cell 2015)"
# you can subset a dataframe the same way as the vector by indicating which row you want, either with a positional ( index ) or logical vector
## moreover you can select which column you want to display
k2 ( studies[ grepl("Breast Invasive Carcinoma", studies$name, ignore.case = T ) , c("name", "studyId", "citation") ] )
# QUESTION: find the study that we need
# study id is: brca_tcga_pub2015
brca.study = "brca_tcga_pub2015"
# Note: each institution has different ways of organizing their data
## even different versions of the API can be structured differently
# Now that we found the study we can study it a bit.
## list out all the available data sets
mycaselist = sampleLists (cbiop, studyId = brca.study )
k2 ( mycaselist )
## here we see that brca_tcga_pub2015_all is probably the best to use because it includes ALL Complete Tumors
case.list.id = "brca_tcga_pub2015_all"
## Now lets see what samples are in here and to make sure that your assigned TCGA id is in here.
sample_list = samplesInSampleLists(
api = cbiop,
sampleListIds = c ( case.list.id, "brca_tcga_pub2015_erneg", "brca_tcga_pub2015_erpos", "brca_tcga_pub2015_trineg", "brca_tcga_pub2015_her2pos"
)
)
head ( sample_list[[case.list.id]] )
## [1] "TCGA-LQ-A4E4-01" "TCGA-A2-A3KC-01" "TCGA-A2-A3KD-01" "TCGA-A7-A0D9-01"
## [5] "TCGA-A7-A0DA-01" "TCGA-A7-A0CD-01"
# Note how the IDs are structured
## TCGA-LQ-A4E4-01
## project_tissue.source_pid_sampleID
# To match you TCGA ID you must first remove the sampleID
## example
gsub("-[^-]*$", "", 'TCGA-LQ-A4E4-01')
## [1] "TCGA-LQ-A4E4"
# you can use the function apply to loop through the entire list
sample_detail = sample_list # save this for later
sample_list <- lapply(sample_list, function(x) gsub("-[^-]*$", "", x))
mysample = sample_list[[ case.list.id ]]
# Here I chose a random sample to test
myid = "TCGA-C8-A131" # put your assigned id here
# TCGA-D8-A27G
# lets test if our sample is present
mysample[mysample== myid]
## [1] "TCGA-C8-A131"
# here is another way
intersect ( mysample, myid)
## [1] "TCGA-C8-A131"
# yet another
mysample[grepl(myid, mysample)]
## [1] "TCGA-C8-A131"
# lets get clinical some data
myclinicaldata = clinicalData(api = cbiop, studyId = brca.study )
myclinicaldata = myclinicaldata[ myclinicaldata$patientId %in% mysample, ]
# lets get the clinical information for your specific TCGA id.
## lets look through the clinical data and see what info do we have.
## Question can you think of any usage for these data?
DT::datatable( t ( myclinicaldata[myclinicaldata$patientId %in% myid, ] ), options = list (pageLength=20) )
# Question: think about what attributes might be important to your study important fields to consider.
attr1 = c( "sampleId", "AGE","SEX", "AJCC_PATHOLOGIC_TUMOR_STAGE", "DAYS_TO_LAST_FOLLOWUP", "DFS_MONTHS", "DFS_STATUS", "RACE",
"HISTOLOGICAL_DIAGNOSIS", "OS_MONTHS", "OS_STATUS", "TUMOR_STATUS", "DAYS_TO_DEATH", "CANCER_TYPE", "MUTATION_COUNT",
"TUMOR_TISSUE_SITE"
)
DT::datatable( t ( myclinicaldata[myclinicaldata$patientId %in% myid, attr1 ] ), options = list (pageLength=20) )
# Note: think about your experimental designs and what you can do with this.
# Question: now that we have clinical and sample data what can we do with it?
## For example how would you figure out if one of your sample (the assigned TCGA) is a Triple-negative or HER2+.
### Triple-negative breast cancer (TNBC) is characterized by the absence of three common types of receptors known to fuel most breast cancers: estrogen receptors (ER), progesterone receptors (PR), and the human epidermal growth factor receptor 2 (HER2).
# recall our sample list from above
head ( sample_list[[case.list.id ]] )
## [1] "TCGA-LQ-A4E4" "TCGA-A2-A3KC" "TCGA-A2-A3KD" "TCGA-A7-A0D9" "TCGA-A7-A0DA"
## [6] "TCGA-A7-A0CD"
# get samples from other groups
t3.study = "brca_tcga_pub2015_trineg"
t3 = sample_list[[t3.study ]]
her2p.study = "brca_tcga_pub2015_her2pos"
her2p = sample_list[[her2p.study ]]
er_neg.study = "brca_tcga_pub2015_erneg"
er_neg = sample_list[[er_neg.study ]]
er_POS.study = "brca_tcga_pub2015_erpos"
er_POS = sample_list[[er_POS.study ]]
## Answer
t3[t3==myid]
## [1] "TCGA-C8-A131"
her2p[her2p==myid]
## character(0)
er_POS[er_POS==myid]
## character(0)
er_neg[er_neg==myid]
## [1] "TCGA-C8-A131"
# Question: suppose you ask how many are HER2 positive but also ER +?
venn1 = data.frame ( her2p= length ( her2p)
, er_neg= length ( er_POS )
, int= length ( intersect(er_POS,her2p))
)
names ( venn1 )[1:2] = c("Her2+","ER+")
venn.this (
venn1,
type = 2, cp= c("#3C8A9B" , "#9B445D"),
dgg=180
)
# Question, is there a difference in age based on these attributes.
myclinicaldata$AGE = as.numeric ( myclinicaldata$AGE)
# check distribution
hist ( myclinicaldata$AGE )
summary ( myclinicaldata$AGE )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26.00 49.00 59.00 58.81 68.00 90.00 1
# get mutation counts
myclinicaldata$MUTATION_COUNT = as.numeric ( myclinicaldata$MUTATION_COUNT )
# Question what can we do if its not normally distributed?
## answer, log transform, sq root, box-cox, et...
## also can use a non-parametric test such as Mann-Whitney U Test
hist ( myclinicaldata$MUTATION_COUNT )
hist ( log2 ( myclinicaldata$MUTATION_COUNT ))
# get age for patients with both
age.both = intersect (er_POS,her2p )
age.both=myclinicaldata[ myclinicaldata$patientId %in% age.both, ]$AGE
age.Her2only.noER = setdiff( her2p, er_POS)
age.Her2only.noER = myclinicaldata[ myclinicaldata$patientId %in% age.Her2only.noER, ]$AGE
temp = rbind (
data.frame ( age = age.both, type= rep("both", length(age.both)) )
,data.frame ( age = age.Her2only.noER, type= rep("Heronly", length(age.Her2only.noER)))
)
median ( age.Her2only.noER )
## [1] 57.5
median ( age.both )
## [1] 61.5
t.test( age.Her2only.noER , age.both )
##
## Welch Two Sample t-test
##
## data: age.Her2only.noER and age.both
## t = -0.82236, df = 62.923, p-value = 0.414
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.259628 3.026674
## sample estimates:
## mean of x mean of y
## 58.40625 60.52273
ggplot( temp, aes(x=age, fill=type)) +
geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
scale_fill_manual(values=c("#69b3a2", "#404080"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Question can you think of where there might be difference in age?
cor.test ( log2 ( myclinicaldata$MUTATION_COUNT), myclinicaldata$AGE , use = "complete.obs", method="pearson" )
##
## Pearson's product-moment correlation
##
## data: log2(myclinicaldata$MUTATION_COUNT) and myclinicaldata$AGE
## t = 4.8949, df = 814, p-value = 1.186e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1016447 0.2349996
## sample estimates:
## cor
## 0.169096
cor.test ( myclinicaldata$MUTATION_COUNT, myclinicaldata$AGE , use = "complete.obs", method="spearman" )
##
## Spearman's rank correlation rho
##
## data: myclinicaldata$MUTATION_COUNT and myclinicaldata$AGE
## S = 77168023, p-value = 2.234e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1478446
tempdf = data.frame ( mutations =log2 ( myclinicaldata$MUTATION_COUNT), age = myclinicaldata$AGE )
ggplot(tempdf , aes(x= mutations , y= age)) +
geom_point(position = position_jitter(width = 0.05, height = 0.05), size=7, alpha=.3) +
geom_smooth(method=lm
, se=FALSE
, fullrange=TRUE
)+
theme_classic() + stat_cor(method = "pearson"
, label.x.npc = .65
, label.y.npc = .065
, size=6
)
## `geom_smooth()` using formula = 'y ~ x'
# Question: Why do you think the correlation is so weak?
# BONUS question to take home. As a pharmacologist how can you use DFS and OS? For example
#the histogram above only shows age of onset. However, question what if you hypothesize
#that having HER2 positive is much worse of survival then having both HER2 and ER+?
# Now lets see what information is available for your study
molecular_profile =
molecularProfiles(
cbiop,
studyId = brca.study
)
k2 ( molecular_profile )
# so looking at that we now know it contains several interesting modalities.
mutation = molecular_profile[grepl( "Mutation", molecular_profile$description, ignore.case = T), ]$molecularProfileId
cna = molecular_profile[grepl( "Putative copy", molecular_profile$description, ignore.case = T ), ]$molecularProfileId
exp = molecular_profile[grepl( "V2 RSEM", molecular_profile$description), ]$molecularProfileId
exp = exp[ grepl ( "mrna$", exp)]
# Note: its important to be mindful of resources when downloading from public data. We need to play nice and consider the server load.
# here I've pre-fetched all the data you need. This year we will include all ~20K genes for the entire breast cancer
# cohort for RNAseq.
## example
## note: notice I added -01 for sample since we are now dealing with molecular sample and not just id
## also we are only downloading subset of genes for demonstration only
exps <- molecularData(
api = cbiop,
molecularProfileIds = c( exp ),
entrezGeneIds = 1:1000,
sampleIds = paste0 ( myid, "-01" )
)
mut <- mutationData(
cbiop,
molecularProfileIds = mutation,
entrezGeneIds = 1:10000,
sampleIds = paste0 ( myid, "-01" )
)
k2 ( head ( exps$brca_tcga_pub2015_rna_seq_v2_mrna) )
k2 ( head ( mut$brca_tcga_pub2015_mutations) )
# Here is the entire set
thaw = 1
if ( thaw == 0 ){
# Retrieve a list of all genes
all_genes <- cBioPortalData::geneTable (cbiop, pageSize = 50000)
data_rna = data.frame()
tp = "TCGA-LQ-A4E4-01"
total = length ( sample_detail$brca_tcga_pub2015_all )
for ( tp in sample_detail$brca_tcga_pub2015_all ){
if ( any ( tp %in% colnames(data_rna) ) ){
total = total - 1
print ( paste ( "skipping", tp, " ", total) )
next
}
exps <- molecularData(
api = cbiop,
molecularProfileIds = c( exp ),
entrezGeneIds = all_genes$entrezGeneId,
sampleIds = tp
)
if ( is.null ( exps$brca_tcga_pub2015_rna_seq_v2_mrna ) ){
total = total - 1
print ( paste ( "no data", tp, " ", total) )
next
}
exps = exps$brca_tcga_pub2015_rna_seq_v2_mrna [ , c("entrezGeneId", "value" )]
exps = merge ( exps, all_genes, by.x="entrezGeneId", by.y="entrezGeneId" )
result_vector <- exps %>%
dplyr:: select(hugoGeneSymbol, value) %>%
dplyr:: distinct()
colnames ( result_vector )[2] = tp
if ( nrow (data_rna) == 0 ){
data_rna = result_vector
} else {
data_rna = merge ( data_rna,result_vector, by="hugoGeneSymbol" )
}
total = total -1
print ( paste ( tp , total ) )
Sys.sleep(5)
}
#saveRDS(data_rna, "rnaseq.rds")
mut <- mutationData(
cbiop,
molecularProfileIds = mutation,
entrezGeneIds = all_genes$entrezGeneId,
sampleIds = sample_detail$brca_tcga_pub2015_all
)
mut_df = mut$brca_tcga_pub2015_mutations[ , 3:ncol(mut$brca_tcga_pub2015_mutations ) ]
mut_df = merge ( mut_df, all_genes, by="entrezGeneId" )
#saveRDS(mut_df, "mutation.rds")
saveRDS( list (
studies = studies
,mycaselist = mycaselist
,myclinicaldata =myclinicaldata
,RNAseq = data_rna
,mutation= mut_df
, all_genes = all_genes
, meta = "Downloaded with cBioPortalData_2.12.0 on 02/04/2024 \n study set: brca_tcga_pub2015_all"
), "freeze.rds"
)
}else{
data = readRDS("freeze.rds")
}
# let us "unfreeze" our store data
## note: also this will help preserve your data
## note how I included info on when and how it was downloaded
cat ( data$meta, "\n")
## Downloaded with cBioPortalData_2.12.0 on 02/04/2024
## study set: brca_tcga_pub2015_all
mutations = data$mutation
expression = data$RNAseq
# relevant to cancer.
## genes that had already been implicated in cancer.
## We do this by looking through different organization and cross reference a list of genes.
k2 ( cancer.list[17:25, ], "cancer list")
length ( cancer.gene)
## [1] 1805
## lets check this list to make sure genes that are relevant to breast cancer exists
imp = as.character ( cancer.list[grepl("^BRCA|^ATM$|^BARD1$|^CDH1$|^CHEK2$|^NBN$|^NF1$|^PALB2$|^PTEN$|^TP53$|^PIK3CA$", cancer.list$gene), ]$gene )
# Note: we add ^ in front and $ for only some genes and not others.
k2 ( cancer.list[cancer.list$gene %in% imp, ], "breast cancer genes")
# create coordinates
mutations$igv = paste ( mutations$chr, mutations$startPosition, mutations$endPosition, mutations$referenceAllele, mutations$variantAllele)
# clean up mutations
mutations = mutations[ , c(1,4,9:ncol(mutations))]
dim ( mutations )
## [1] 53847 22
# lets take a few minutes here to go over the different fields.
names ( mutations )
## [1] "entrezGeneId" "patientId" "tumorAltCount" "tumorRefCount"
## [5] "normalAltCount" "normalRefCount" "startPosition" "endPosition"
## [9] "referenceAllele" "proteinChange" "mutationType" "ncbiBuild"
## [13] "variantType" "keyword" "chr" "variantAllele"
## [17] "refseqMrnaId" "proteinPosStart" "proteinPosEnd" "hugoGeneSymbol"
## [21] "type" "igv"
unique( mutations$mutationType)
## [1] "Missense_Mutation" "Nonsense_Mutation" "Frame_Shift_Ins"
## [4] "Nonstop_Mutation" "Splice_Site" "Frame_Shift_Del"
## [7] "Translation_Start_Site" "In_Frame_Ins" "In_Frame_Del"
## [10] "Splice_Region"
#################### BACK to lecture
# based on the lecture previously can you identify the different mutation_type
# list mutations for your sample
nrow ( mutations [ mutations$patientId == myid , ] )
## [1] 51
mt = mutations [ mutations$patientId == myid , ]
mt = mt %>% replace(is.na(.), '') %>% data.frame()
k2 ( mt )
# What are some of things that we can look look for?
# Link to other resources are available.
# Here we use COSMIC a manually curated list of recurrent mutations in cancer and look for keyword breast
## WARNING THIS IS NOT A COMPREHENSIVE LIST. AND IT IS OUTDATED.
## only for demo
k2 ( head ( cosmic.70[ grepl("breast", cosmic.70$description ), ] ), "breast cancer in cosmic")
# note there are are several other databases such as clinvar that we can look for
# lets integrate cosmic into our mutation table
# all coordinates are are based on hg19
# note: all.x = T and all.y=F
mutations = merge ( mutations
, cosmic.70[ , c( "igv", "description")]
, by="igv", all.x=T, all.y=F )
# let us find out how many mutations we have that is also found in cosmic
# the description field should not be empty if there was a cosmic entry
nrow ( mutations[ ! is.na ( mutations$description ), ] ) / nrow (mutations)
## [1] 0.71961
# check if cosmic contain BRCA mutations. This is a good sanity check.
k2 ( head ( mutations[ grepl("^BRCA", mutations$hugoGeneSymbol), ] ) )
# Extra: go to https://cancer.sanger.ac.uk/cosmic to get latest version since this is very outdated.
# lets tabulate the types of mutations
mutation.type = data.frame ( table ( mutations$mutationType))
mutation.type = mutation.type[ order ( mutation.type$Freq), ]
# note: setting the factor level will control the order in which it is plot
mutation.type$Var1 = factor ( mutation.type$Var1, levels = mutation.type$Var1)
ggplot(mutation.type, aes(Var1,Freq), label=Freq ) +
geom_bar(aes(fill = Freq), stat="identity", position = "dodge") +
coord_flip() +
scale_fill_distiller(palette = "RdBu") + xlab("") + ylab("") +
theme(strip.text.y = element_text(angle = 0), legend.position="none") +
geom_text(aes(label=Freq), position=position_dodge(width=0.9), vjust=.4, hjust = .5, size=5) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size=16), # size of label
axis.text.x = element_text(angle=0, hjust=1) ) + ggtitle ( "All mutation types")
# As mentioned previously one reliable way to look for relevant mutations is to match it against what has been previously observed and/or study. Here we can try to use Cosmic however other organization may be relevant as well including clinvar
cosmic.mutation = mutations [ !is.na(mutations$description), ]
mutation.type = data.frame ( table ( cosmic.mutation$mutationType))
mutation.type = mutation.type[ order ( mutation.type$Freq), ]
mutation.type$Var1 = factor ( mutation.type$Var1, levels = mutation.type$Var1)
ggplot(mutation.type, aes(Var1,Freq), label=Freq ) +
geom_bar(aes(fill = Freq), stat="identity", position = "dodge") +
coord_flip() +
scale_fill_distiller(palette = "RdBu") + xlab("") + ylab("") +
theme(strip.text.y = element_text(angle = 0), legend.position="none") +
geom_text(aes(label=Freq), position=position_dodge(width=0.9), vjust=.4, hjust = .5, size=5) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size=16), # size of label
axis.text.x = element_text(angle=0, hjust=1) ) + ggtitle ( "Mutation types found in cosmic")
# Question seems like missense mutation is the most recurrent. Can you think of any way to filter the data further
# in the lecture we talked about depth. So lets calculate this.
mutations$dp = mutations$tumorAltCount + mutations$tumorRefCount
seg =quantile ( mutations$dp, probs= c( .01, .2 ))
seg[1]
## 1%
## 9
ggplot(mutations, aes(x=dp)) +
geom_density( color="darkblue", fill="steelblue" ) + geom_vline(xintercept = seg[1], linetype="dotted",
color = "grey", size=1.5) + ggtitle ( " density plot, total depth")
# before
dim ( mutations )
## [1] 53850 24
# after
mutations = mutations[ mutations$dp > as.numeric ( seg[1] ), ]
dim (mutations )
## [1] 53188 24
# another thing we can look at is Allele frequency
# lets calculate allele frequency here as well
mutations$af = mutations$tumorAltCount / mutations$dp
ggplot(mutations, aes(x=af)) +
geom_density( color="darkblue", fill="#e8975a" ) + geom_vline(xintercept = .1, linetype="dotted",
color = "grey", size=1.5) + ggtitle ( " density plot, allele freqeuncy")
# pick the top 10 mutations ranked by af
mutations = mutations[ order ( - mutations$af ) , ]
# lets further filter the mutations with any af > .1
mutations = mutations[ mutations$af > .1, ]
dim (mutations)
## [1] 50935 25
# simpify the output although we probably want to keep it just in case.
cleanmut = c ("hugoGeneSymbol", "mutationType", "proteinChange", "referenceAllele", "variantAllele" , "tumorAltCount", "tumorRefCount" , "dp", "af", "igv", "description")
DT::datatable ( head ( mutations [ , c( cleanmut )]) )
# based on literature these are some of the genes involved in breast cancer.
imp
## [1] "ATM" "BARD1" "BRCA1" "BRCA2" "CDH1" "CHEK2" "NBN" "NF1"
## [9] "PALB2" "PIK3CA" "PTEN" "TP53"
## lets study them and see what types of mutations corresponds most with these genes.
mutation.brca = data.frame ( table ( mutations[ mutations$hugoGeneSymbol %in% imp, ] $mutationType))
mutation.brca = mutation.brca[ order ( mutation.brca$Freq), ]
mutation.brca$Var1 = factor ( mutation.brca$Var1, levels = mutation.brca$Var1)
ggplot(mutation.brca, aes(Var1,Freq), label=Freq ) +
geom_bar(aes(fill = Freq), stat="identity", position = "dodge") +
coord_flip() +
scale_fill_distiller(palette = "RdBu") + xlab("") + ylab("") +
theme(strip.text.y = element_text(angle = 0), legend.position="none") +
geom_text(aes(label=Freq), position=position_dodge(width=0.9), vjust=.4, hjust = .5, size=5) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size=16), # size of label
axis.text.x = element_text(angle=0, hjust=1) ) + ggtitle ( "Mutation-types in genes associated with breast cancer")
# lets assume that we don't know these genes
# Example: can you think of another way to rediscover these genes and/or find novel ones?
# plot 20 highest recurrent gene mutations
mutation.gene = data.frame ( table ( mutations$hugoGeneSymbol))
mutation.gene = mutation.gene[ order ( - mutation.gene$Freq), ]
mutation.gene$Var1 = factor ( mutation.gene$Var1, levels = rev ( mutation.gene$Var1) )
ggplot(mutation.gene [ 1:20, ] , aes(Var1,Freq), label=Freq ) +
geom_bar(aes(fill = Freq), stat="identity", position = "dodge") +
coord_flip() +
scale_fill_distiller(palette = "RdBu") + xlab("") + ylab("") +
theme(strip.text.y = element_text(angle = 0), legend.position="none") +
geom_text(aes(label=Freq), position=position_dodge(width=0.9), vjust=.4, hjust = .5, size=5) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size=16), # size of label
axis.text.x = element_text(angle=0, hjust=1) ) + ggtitle ( "Most recurrent genes")
# look into cosmic to see how often these genes appears in breast cancer.
imp.brac = data.frame ( table ( mutations[ grepl( "breast", mutations$description), ]$hugoGeneSymbol ) )
imp.brac$fraction = imp.brac$Freq / nrow ( mutations)
imp.brac = imp.brac [ order ( -imp.brac$fraction) , ]
k2 ( head ( imp.brac, 20), "top cosmic genes found in breast cancer")
# let look at the mutation breakdown
top5 = as.character ( imp.brac$Var1[1:5] )
top5 = mutations[ mutations$hugoGeneSymbol %in% top5, c ( "proteinChange", "hugoGeneSymbol" )]
top5 = data.frame ( table ( top5 ) )
top5 = top5 [ order ( - top5$Freq), ]
# quick hack to add url, will not work with *
k2 ( head ( top5, 20), "top 20 recurrent mutations")
# Exercise: take the first mutation and google it what does it say?
# take the second, what does google say?
# part of studying mutations is look for clinical significance. One way to do this is to match your mutations to actionable targets
mutations$drug = paste ( mutations$hugoGeneSymbol, mutations$proteinChange)
action = drug[ drug$match %in% unique ( mutations$drug ), ]
action.f = c("gene" ,"variant" , "disease", "drugs", "evidence_type",
"clinical_significance", "evidence_statement", "evidence_civic_url", "gene_civic_url" , "match"
)
k2 ( head ( action[action$clinical_significance == "Sensitivity/Response" ,action.f], 2) , "Sensitivity/Response" )
# the opposite could be true
k2 ( head ( action[action$clinical_significance == "Resistance" ,action.f], 2) , "Resistance" )
# which variant are resistant to certain treatment?
k2 ( mutations [ mutations$proteinChange == "L755S", ] , "resistant to Lapatinib ")
row.names ( expression ) = expression$hugoGeneSymbol
expression$hugoGeneSymbol = NULL
expressionc= data.frame ( log2 ( expression + 1 ))
expressionc = na.omit(expressionc)
her2p2 = gsub ( "-", ".", her2p)
her2p2 = paste(her2p2, collapse = "|")
cname = colnames (expressionc )
last_j = which ( grepl ( her2p2, cname ) )
last_b = which ( ! grepl ( her2p2, cname ) )
results = apply (expressionc, 1, function(x) wtest(x))
results = data.frame ( t ( results ))
colnames(results) = c( "pv", "logfc", "her2_ave","base_ave", "all_ave" )
results$fdr = p.adjust(results$pv, method = "fdr", n=nrow ( results))
results = results[ order ( results$fdr), ]
results = na.omit(results)
results$class = ifelse( results$fdr > 0.05 , "no-change",
ifelse(
results$logfc >= .5, "up",
ifelse(
results$logfc <= -.5, "down",
"no-change"
)
) )
# lets check the distribution
table ( results$class)
##
## down no-change up
## 784 18877 352
head ( results[results$class == "up", ], 3)
results = data.frame ( results)
results$gene = row.names ( results )
ggplot(results , aes(logfc, -log10(fdr))) +
geom_point(aes(col=class), size=3, alpha=.6) +
scale_color_manual(values=c("up"="#5EAE64", "no-change"="grey", "down"= "#D55A3D"), breaks=c("up", "down", "no-change"), labels=c(paste0("up in ",'her2',"(",nrow(results [results $class=='up',]),")"), paste0("down in ", 'her2' ,"(",nrow(results[results $class=='down',]),")"), "no-change")) +
ggtitle(paste0( " Volcano plot ")) +theme_bw(base_size = 30) + theme(legend.text=element_text(size=15), legend.position = "bottom") + guides(color = guide_legend(override.aes = list(size=10))) +
geom_text_repel(
data = head ( results , 10) ,
aes(
label = gene
)
,
fontface="bold",
color="black",
size = 6,
nudge_x = .15,
box.padding = .25,
nudge_y = .1,
)
results = results[ order ( -results$logfc), ]
glist = setNames( results$logfc, results$gene)
glist = sort(glist, decreasing = TRUE)
gsea_dot
gsea_ridge
## Picking joint bandwidth of 0.0757
go_1
go_2
samplesnice = gsub ( "-",".", samples) # common issue when converting from dataframe to data matrix.
expressionc[ row.names (expressionc ) %in% c( "TP53" , "KRAS"), samplesnice ]
do.these = head ( names ( glist ) )
plot.out = list()
for ( gene in do.these ){
g1 = melt ( expression[gene, ] )
colnames ( g1 )= c("pid","value")
g1$pid = gsub("-[^-]*$", "", g1$pid)
g1$HER2p = ifelse ( g1$pid %in% her2p, "HER2P", "NO.HER2")
g1$value = log2 ( g1$value + 1 )
mann_whit = wilcox.test(g1[g1$HER2p == "NO.HER2", ]$value,
g1[g1$HER2p != "NO.HER2", ]$value
,paired=FALSE)
p = round ( mann_whit$p.value, 2 )
bon = p * length ( do.these )
gthis = ggplot(g1, aes(y=value, x=HER2p, fill=HER2p)) +
geom_violin()+
geom_jitter(shape=19, position=position_jitter(0.07), size=2 ) +
theme_bw() +
ylab("log2 (tpm + 1 ) ") +
xlab("") +
theme(legend.position="none", legend.title=element_blank(), legend.key = element_blank(),
axis.text.y = element_text(size=12),
axis.text.x = element_text(angle = 45, size=10, hjust=1),
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
legend.text =element_text(size=12)
) + stat_summary(fun.y = mean, fun.ymin = mean, fun.ymax = mean,
geom = "crossbar", width = .5) + ggtitle ( gene ) +
scale_fill_manual(values = c("#a6cee3","#b2df8a", "#fdbf6f") ) +
ggtitle ( paste ( gene, "p.value", p, "corr", bon ))
plot.out[[gene]] = gthis
#plot ( gthis )
}
## No id variables; using all as measure variables
## No id variables; using all as measure variables
## No id variables; using all as measure variables
## No id variables; using all as measure variables
## No id variables; using all as measure variables
## No id variables; using all as measure variables
do.call("grid.arrange", c(plot.out, ncol=2))
exp = data.frame ( t ( expressionc ))
exp =exp [ , unique ( c( "ERBB2", names ( exp)))]
temp = cor(exp, exp$ERBB2, method="spearman" )
her2.c = as.numeric(temp )
names ( her2.c) = row.names ( temp )
her2.c = sort ( abs ( her2.c ), decreasing = T )
her2.c = round ( her2.c, 3)
do.these = head ( names ( her2.c ), 7)
do.these = do.these[-1]
plot.out = list()
for ( gene in do.these ){
gthis = ggplot(exp, aes_string (x="ERBB2", y=gene)) +
geom_point(position = position_jitter(width = 0.5, height = 0.5)) +
geom_smooth(method=lm
, se=FALSE
, fullrange=TRUE
)+
theme_classic() + stat_cor(method = "spearman") + ggtitle(gene)
plot.out[[gene]] = gthis
#plot ( gthis )
}
do.call("grid.arrange", c(plot.out, ncol=2))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# what methods can we employ to study a single sample
## how do we compare N=1
scaled.dat <- data.frame ( t ( scale(exp) ))
myid2 = gsub ( "-", ".", myid )
colnames ( scaled.dat) = gsub("\\.[^\\.]*$", "", colnames ( scaled.dat) )
my.dat = scaled.dat[, myid2, drop=T]
names ( my.dat)= row.names (scaled.dat )
my.dat = sort ( abs ( my.dat ), decreasing = T )
my.dat = round ( my.dat, 3)
gene = names ( my.dat[1:10] )
g1 = melt ( as.matrix ( expressionc[ gene, ] ) )
colnames ( g1 )= c("gene","pid","value")
g1 = g1[ order ( g1$value), ]
g1$pid = gsub("\\.[^\\.]*$", "", g1$pid )
ggplot(g1, aes(x=gene, y=value )) +
geom_violin()+
geom_jitter(shape=19, position=position_jitter(0.07), aes( colour=value), size=3, alpha=.3 ) +
scale_fill_brewer(palette = "Set1") +
xlab("") + ylab("") +
geom_text_repel(
data = g1[g1$pid == myid2, ],
aes(
label = pid
)
,
fontface="bold",
color="black",
size = 3,
nudge_x = .15,
box.padding = .25,
nudge_y = .1,
) + ggtitle ( paste ( "Expression of ", gene, "relative to all ALL breast cancer samples" ) ) +
theme_bw() +
theme(legend.position="none", legend.title=element_blank(), legend.key = element_blank(),
axis.text.y = element_text(size=15),
axis.text.x = element_text(angle = 0, size=15),
axis.title.x = element_text(size=22),
axis.title.y = element_text(size=22)
)
# list of "overexpressed" genes for id
head ( my.dat[my.dat> 1.96 ], 3)
## H4C7 GSTA5 FEZF1
## 13.399 9.495 7.137
gene = names ( my.dat[my.dat> 1.96 ] )
gene = gene[ gene %in% cancer.list$gene]
g1 = melt ( as.matrix ( expressionc[ gene[1:12], ] ) )
colnames ( g1 )= c("gene","pid","value")
g1$pid = as.character ( g1$pid)
g1$pid = gsub("\\.[^\\.]*$", "", g1$pid )
g1$group = ifelse ( g1$pid == myid2, myid2, "base")
ccc = setNames( c("red", "grey"), c(myid2, "base"))
ggplot(g1, aes(y=value, x=gene , colour=group )) +
geom_violin()+
geom_jitter(shape=19, position=position_jitter(0.07), size=3 ) +
theme_bw() +
ylab("log2 (tpm + 1 ) ") +
xlab("") +
theme(legend.position="bottom", legend.title=element_blank(), legend.key = element_blank(),
axis.text.y = element_text(size=12),
axis.text.x = element_text(angle = 0, size=11.5),
axis.title.x = element_text(size=22),
axis.title.y = element_text(size=22),
legend.text =element_text(size=12)
) + ggtitle ( paste ( "Top expression gene", myid2 ) ) + scale_colour_manual(values=ccc) +
facet_wrap(gene ~., scales = "free" )
# here is example where I chose, myid = "TCGA-C8-A131-01"
actionid = drug[ drug$match %in% unique ( mutations[mutations$patientId == myid, ] $drug ), ]
actionid = actionid %>% dplyr::group_by(variant) %>%
dplyr::summarise(
gene = paste(unique ( gene ), collapse = "," ) ,
disease = paste(unique ( disease ), collapse = "," ),
clinical_significance = paste(unique ( clinical_significance ), collapse = "," ),
drugs = paste(unique ( drugs ), collapse = "," )
) %>%
data.frame()
k2( actionid, myid)